/////////////////////////////////////////////////////////////////
// Sequence.h
//
// Class for reading/manipulating single sequence character data.
/////////////////////////////////////////////////////////////////

#ifndef SEQUENCE_H
#define SEQUENCE_H

#include <string>
#include <fstream>
#include <iostream>
#include <cctype>
#include <cstdlib>
#include "SafeVector.h"
#include "FileBuffer.h"

/////////////////////////////////////////////////////////////////
// Sequence
//
// Class for storing sequence information.
/////////////////////////////////////////////////////////////////

class Sequence
{

    bool isValid; // a boolean indicating whether the sequence data is valid or not
    string header;       // string containing the comment line of the FASTA file
    SafeVector<char> *data;      // pointer to character data
    int length;                  // length of the sequence
    int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
    //   in a Multi-FASTA file
    int inputLabel;              // position of sequence in original input

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Default constructor.  Does nothing.
    /////////////////////////////////////////////////////////////////

    Sequence() :
        isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel(
            0)
    {
    }

public:

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Constructor.  Reads the sequence from a FileBuffer.
    /////////////////////////////////////////////////////////////////

    Sequence(FileBuffer &infile, bool stripGaps = false) :
        isValid(false), header("~"), data(NULL), length(0), sequenceLabel(
            0), inputLabel(0)
    {

        // read until the first non-blank line
        while (!infile.eof())
        {
            infile.GetLine(header);
            if (header.length() != 0)
                break;
        }

        // check to make sure that it is a correct header line
        if (header[0] == '>')
        {

            // if so, remove the leading ">"
            header = header.substr(1);

            // remove any leading or trailing white space in the header comment
            while (header.length() > 0 && isspace(header[0]))
                header = header.substr(1);
            while (header.length() > 0 && isspace(header[header.length() - 1]))
                header = header.substr(0, header.length() - 1);

            // get ready to read the data[] array; note that data[0] is always '@'
            char ch;
            data = new SafeVector<char>;
            assert(data);
            data->push_back('@');

            // get a character from the file
            while (infile.Get(ch))
            {

                // if we've reached a new comment line, put the character back and stop
                if (ch == '>')
                {
                    infile.UnGet();
                    break;
                }

                // skip whitespace
                if (isspace(ch))
                    continue;

                // substitute gap character
                if (ch == '.')
                    ch = '-';
                if (stripGaps && ch == '-')
                    continue;

                // check for known characters
                if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')))
                {
                    cerr << "ERROR: Unknown character encountered: " << ch
                         << endl;
                    exit(1);
                }

                // everything's ok so far, so just store this character.
                if (ch >= 'a' && ch <= 'z')
                {
                    ch = ch - 'a' + 'A';
                }	//change to upper case. fixed by Liu Yongchao, May 21, 2010

                data->push_back(ch);
                ++length;
            }

            // sequence must contain data in order to be valid
            isValid = length > 0;
            if (!isValid)
            {
                delete data;
                data = NULL;
            }
        }
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Constructor.  Builds a sequence from existing data.  Note
    // that the data must use one-based indexing where data[0] should
    // be set to '@'.
    /////////////////////////////////////////////////////////////////

    Sequence(SafeVector<char> *data, string header, int length,
             int sequenceLabel, int inputLabel) :
        isValid(data != NULL), header(header), data(data), length(length), sequenceLabel(
            sequenceLabel), inputLabel(inputLabel)
    {
        assert(data);
        assert((*data)[0] == '@');
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Destructor.  Release allocated memory.
    /////////////////////////////////////////////////////////////////

    ~Sequence()
    {
        if (data)
        {
            assert(isValid);
            delete data;
            data = NULL;
            isValid = false;
        }
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetHeader()
    //
    // Return the string comment associated with this sequence.
    /////////////////////////////////////////////////////////////////

    string GetHeader() const
    {
        return header;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetName()
    //
    // Return the first word of the string comment associated with this sequence.
    /////////////////////////////////////////////////////////////////

    string GetName() const
    {
        char name[1024];
        sscanf(header.c_str(), "%s", name);
        return string(name);
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetDataPtr()
    //
    // Return the iterator to data associated with this sequence.
    /////////////////////////////////////////////////////////////////

    SafeVector<char>::iterator GetDataPtr()
    {
        assert(isValid);
        assert(data);
        return data->begin();
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetPosition()
    //
    // Return the character at position i.  Recall that the character
    // data is stored with one-based indexing.
    /////////////////////////////////////////////////////////////////

    char GetPosition(int i) const
    {
        assert(isValid);
        assert(data);
        assert(i >= 1 && i <= length);
        return (*data)[i];
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::SetLabel()
    //
    // Sets the sequence label to i.
    /////////////////////////////////////////////////////////////////

    void SetLabel(int i)
    {
        assert(isValid);
        sequenceLabel = i;
        inputLabel = i;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::SetSortLabel()
    //
    // Sets the sequence sorting label to i.
    /////////////////////////////////////////////////////////////////

    void SetSortLabel(int i)
    {
        assert(isValid);
        sequenceLabel = i;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetLabel()
    //
    // Retrieves the input label.
    /////////////////////////////////////////////////////////////////

    int GetLabel() const
    {
        assert(isValid);
        return inputLabel;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetSortLabel()
    //
    // Retrieves the sorting label.
    /////////////////////////////////////////////////////////////////

    int GetSortLabel() const
    {
        assert(isValid);
        return sequenceLabel;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Fail()
    //
    // Checks to see if the sequence successfully loaded.
    /////////////////////////////////////////////////////////////////

    bool Fail() const
    {
        return !isValid;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Length()
    //
    // Returns the length of the sequence.
    /////////////////////////////////////////////////////////////////

    int GetLength() const
    {
        assert(isValid);
        assert(data);
        return length;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::WriteMFA()
    //
    // Writes the sequence to outfile in MFA format.  Uses numColumns
    // columns per line.  If useIndex is set to false, then the
    // header is printed as normal, but if useIndex is true, then
    // ">S###" is printed where ### represents the sequence label.
    /////////////////////////////////////////////////////////////////

    void WriteMFA(ostream &outfile, int numColumns,
                  bool useIndex = false) const
    {
        assert(isValid);
        assert(data);
        assert(!outfile.fail());

        // print out heading
        if (useIndex)
            outfile << ">S" << GetLabel() << endl;
        else
            outfile << ">" << header << endl;

        // print out character data
        int ct = 1;
        for (; ct <= length; ct++)
        {
            outfile << (*data)[ct];
            if (ct % numColumns == 0)
                outfile << endl;
        }
        if ((ct - 1) % numColumns != 0)
            outfile << endl;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Clone()
    //
    // Returns a new deep copy of the seqeuence.
    /////////////////////////////////////////////////////////////////

    Sequence *Clone() const
    {
        Sequence *ret = new Sequence();
        assert(ret);

        ret->isValid = isValid;
        ret->header = header;
        ret->data = new SafeVector<char>;
        assert(ret->data);
        *(ret->data) = *data;
        ret->length = length;
        ret->sequenceLabel = sequenceLabel;
        ret->inputLabel = inputLabel;

        return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetRange()
    //
    // Returns a new sequence object consisting of a range of
    // characters from the current seuquence.
    /////////////////////////////////////////////////////////////////

    Sequence *GetRange(int start, int end) const
    {
        Sequence *ret = new Sequence();
        assert(ret);

        assert(start >= 1 && start <= length);
        assert(end >= 1 && end <= length);
        assert(start <= end);

        ret->isValid = isValid;
        ret->header = header;
        ret->data = new SafeVector<char>;
        assert(ret->data);
        ret->data->push_back('@');
        for (int i = start; i <= end; i++)
            ret->data->push_back((*data)[i]);
        ret->length = end - start + 1;
        ret->sequenceLabel = sequenceLabel;
        ret->inputLabel = inputLabel;

        return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::AddGaps()
    //
    // Given an SafeVector<char> containing the skeleton for an
    // alignment and the identity of the current character, this
    // routine will create a new sequence with all necesssary gaps added.
    // For instance,
    //    alignment = "XXXBBYYYBBYYXX"
    //    id = 'X'
    // will perform the transformation
    //    "ATGCAGTCA" --> "ATGCC---GT--CA"
    //                    (XXXBBYYYBBYYXX)
    /////////////////////////////////////////////////////////////////

    Sequence *AddGaps(SafeVector<char> *alignment, char id)
    {
        Sequence *ret = new Sequence();
        assert(ret);

        ret->isValid = isValid;
        ret->header = header;
        ret->data = new SafeVector<char>;
        assert(ret->data);
        ret->length = (int) alignment->size();
        ret->sequenceLabel = sequenceLabel;
        ret->inputLabel = inputLabel;
        ret->data->push_back('@');

        SafeVector<char>::iterator dataIter = data->begin() + 1;
        for (SafeVector<char>::iterator iter = alignment->begin();
                iter != alignment->end(); ++iter)
        {
            if (*iter == 'B' || *iter == id)
            {
                ret->data->push_back(*dataIter);
                ++dataIter;
            }
            else
                ret->data->push_back('-');
        }

        return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetString()
    //
    // Returns the sequence as a string with gaps removed.
    /////////////////////////////////////////////////////////////////

    string GetString()
    {
        string s = "";
        for (int i = 1; i <= length; i++)
        {
            // if ((*data)[i] != '-')
             s += (*data)[i];
        }
        return s;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetMapping()
    //
    // Returns a SafeVector<int> containing the indices of every
    // character in the sequence.  For instance, if the data is
    // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
    /////////////////////////////////////////////////////////////////

    SafeVector<int> *GetMapping() const
    {
        SafeVector<int> *ret = new SafeVector<int>(1, 0);
        for (int i = 1; i <= length; i++)
        {
            if ((*data)[i] != '-')
                ret->push_back(i);
        }
        return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Highlight()
    //
    // Changes all positions with score >= cutoff to upper case and
    // all positions with score < cutoff to lower case.
    /////////////////////////////////////////////////////////////////

    void Highlight(const SafeVector<float> &scores, const float cutoff)
    {
        for (int i = 1; i <= length; i++)
        {
            if (scores[i - 1] >= cutoff)
                (*data)[i] = toupper((*data)[i]);
            else
                (*data)[i] = tolower((*data)[i]);
        }
    }
};

#endif
